{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd # Pandas\n",
    "import pathlib # Built-in path manipulation library\n",
    "import requests\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = pathlib.Path('../data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/states_daily_4pm_et.csv\"\n",
    "df_raw = pd.read_csv(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = df_raw['state'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix Michigan\n",
    "import bs4\n",
    "import numpy as np\n",
    "import requests\n",
    "response = requests.get('https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173_99207---,00.html')\n",
    "soup = bs4.BeautifulSoup(response.content)\n",
    "three_five_index = [x.find_all('td')[0].text for x in soup.find('table').find_all('tr') if x.find_all('td')].index('5-Mar')\n",
    "daily_counts = [int(x.find_all('td')[-1].text) for x in soup.find('table').find_all('tr') if x.find_all('td')][:-1]\n",
    "cum_counts = np.cumsum(daily_counts)\n",
    "cum_counts = cum_counts[three_five_index:]\n",
    "to_fix = df_raw.loc[df_raw['state']=='MI', 'date'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d')).sort_values().index\n",
    "df_raw.loc[to_fix[:len(cum_counts)], 'positive'] = cum_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recovery data for AK,AR,AZ,CO,DC,DE,GU,HI,IA,ID,KS,KY,LA,MD,ME,MI,MN,MS,MT,ND,NH,NJ,NM,NV,NY,OK,RI,SC,SD,TN,TX,UT,VA,VI,VT,WI,WV,WY\n",
      "No recovery data for AL,AS,CA,CT,FL,GA,IL,IN,MA,MO,MP,NC,NE,OH,OR,PA,PR,WA\n"
     ]
    }
   ],
   "source": [
    "good = []\n",
    "bad = []\n",
    "for state in states:  # For each country\n",
    "    source = df_raw[df_raw['state']==state] # Only the given state\n",
    "    if source['recovered'].sum() > 0: # If we have data in the downloaded file for that state\n",
    "        df = pd.DataFrame(columns=['dates2', 'cum_cases', 'cum_deaths', 'cum_recover',\n",
    "                               'new_cases', 'new_deaths', 'new_recover', 'new_uninfected'])\n",
    "        df['dates2'] = source['date'].apply(lambda x: datetime.strftime(datetime.strptime(str(x), '%Y%m%d'), '%m/%d/%y')) # Convert date format\n",
    "        df['cum_cases'] = source['positive'].values\n",
    "        df['cum_deaths'] = source['death'].values\n",
    "        df['cum_recover'] = source['recovered'].values\n",
    "        df = df.set_index('dates2').fillna(0).astype(int)  # Fill NaN with 0 and convert to int\n",
    "        df = df.sort_index() # Sort by date ascending\n",
    "        df[['new_cases', 'new_deaths', 'new_recover']] = \\\n",
    "            df[['cum_cases', 'cum_deaths', 'cum_recover']].diff()\n",
    "        if df['new_cases'].max() >= 5:\n",
    "            df['new_uninfected'] = df['new_recover'] + df['new_deaths']\n",
    "            df = df.fillna(0).astype(int)\n",
    "            df.to_csv(data_dir / ('covidtimeseries_US_%s.csv' % state))  # Overwrite old data\n",
    "            good.append(state)\n",
    "        else:\n",
    "            bad.append(state)\n",
    "    else:\n",
    "        bad.append(state)\n",
    "        \n",
    "print(\"Recovery data for %s\" % ','.join(good))\n",
    "print(\"No recovery data for %s\" % ','.join(bad))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['03/07/20', '03/08/20', '03/09/20', '03/10/20', '03/11/20', '03/12/20',\n",
       "       '03/13/20', '03/14/20', '03/15/20', '03/16/20', '03/17/20', '03/18/20',\n",
       "       '03/19/20', '03/20/20', '03/21/20', '03/22/20', '03/23/20', '03/24/20',\n",
       "       '03/25/20', '03/26/20', '03/27/20', '03/28/20', '03/29/20', '03/30/20',\n",
       "       '03/31/20', '04/01/20', '04/02/20', '04/03/20', '04/04/20', '04/05/20',\n",
       "       '04/06/20', '04/07/20', '04/08/20', '04/09/20', '04/10/20', '04/11/20',\n",
       "       '04/12/20', '04/13/20', '04/14/20', '04/15/20', '04/16/20', '04/17/20',\n",
       "       '04/18/20', '04/19/20', '04/20/20', '04/21/20', '04/22/20', '04/23/20',\n",
       "       '04/24/20', '04/25/20', '04/26/20', '04/27/20', '04/28/20', '04/29/20',\n",
       "       '04/30/20'],\n",
       "      dtype='object', name='dates2')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
